In [ ]:
import pickle;
from imp import reload
import numpy as np; import pandas as pd
import lightgbm as lgb; import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import constants, utils, inference, evaluation
from joblib import Parallel, delayed
import multiprocessing
import os
pd.options.mode.chained_assignment = None
In [2]:
def generate_pred(m, idx, is_sub=False):
    '''Score one saved model on the test features and optionally evaluate it.

    Relies on notebook globals: `data` (feature frame), `gid` (id columns),
    and -- when `is_sub` is False -- `label` (ground-truth targets).

    Parameters
    ----------
    m : str
        Path to a pickled booster (lightgbm / xgboost).
    idx : int
        Index of the model, used only for progress logging.
    is_sub : bool
        True when generating a submission (no labels available);
        False to also compute AUC and mean-F1 against `label`.

    Returns
    -------
    (dict, array) when is_sub is False: evaluation summary and raw scores.
    array when is_sub is True: raw scores only.
    '''
    print('Evaluating Model {} ...'.format(idx))
    print('Model Path {}'.format(m))
    # NOTE: pickle.load on an untrusted file can execute arbitrary code;
    # accepted here because these are locally produced model files.
    with open(m, 'rb') as fh:  # context manager -- original leaked the handle
        bst = pickle.load(fh)
    feat = data[utils.get_feat_col(bst)]  # `data` is a notebook global
    pred = utils.get_predition(bst, feat)
    # Explicit .copy() so the score column is not written into a slice of
    # `gid` (chained-assignment pattern; the warning is silenced globally).
    user_product = gid[['user_id', 'product_id', 'order_id']].copy()
    user_product['score'] = pred
    if is_sub is False:
        user_product['label'] = label  # `label` is a notebook global
        auc = roc_auc_score(label, pred)
        print('Evaluation AUC {}'.format(auc))
        op = user_product.copy()
        op = utils.tarbox_f1_optim(op, low_bound=0)
        # 'None' is a sentinel token meaning "no reordered products";
        # keep it as the literal string, convert everything else to int.
        op['products'] = op['products'].apply(
            lambda x: [int(i) if i != 'None' else i for i in x.split()])
        # Left-join back so every order appears even if the optimizer
        # produced no row for it.
        op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
                      op, on=['order_id'], how='left')
        gold = evaluation.get_gold(user_product)
        res = evaluation.evaluation(gold, op[['order_id', 'products']])
        mf1 = res.f1score.mean()
        print('F1 Optimization Result: mean-f1-score {}'.format(mf1))
        eval_res = {'model_file': m.split('/')[-1], 'eval_auc': auc, 'eval_mf1': mf1}
        return eval_res, pred
    else:
        return pred
In [5]:
data = pd.read_hdf('/data/Instacart/test.h5')
In [6]:
# One row per test order (used later to right-join submissions).
orders = data[['order_id']].drop_duplicates()
# Distinct (order, product) pairs. NOTE(review): not used below -- confirm
# it is needed elsewhere or remove.
up_pair = data[['order_id', 'product_id']].drop_duplicates()
# Identifier columns used to attach model scores to rows.
gid = data[constants.ID_COLS]
In [7]:
# Per-model evaluation results, ordered by mean-F1 (ascending, worst first).
bagging_tree = (
    pd.read_hdf(constants.EVA_DATA_DIR + 'bagging_tree.h5')
    .sort_values('eval_mf1')
)
In [8]:
bagging_tree.reset_index()
Out[8]:
In [9]:
# ~11 min on a cold cache: score the test set with every bagged model,
# memoizing each prediction vector as a pickle under ./submission/.
pred_subs = []
for idx, m in enumerate(bagging_tree.model_file.values):
    # NOTE: the cache suffix is 'pkl' with no dot -- kept byte-identical
    # so existing cache files stay valid.
    fp = './submission/' + m.split('/')[-1] + 'pkl'
    if os.path.exists(fp):
        with open(fp, 'rb') as f:  # context manager -- original leaked the handle
            pred = pickle.load(f)
    else:
        pred = generate_pred(m, idx, is_sub=True)
        with open(fp, 'wb') as f:  # reuse fp instead of rebuilding the path
            pickle.dump(pred, f, pickle.HIGHEST_PROTOCOL)
    pred_subs.append(pred)
In [29]:
# Hierarchical median bagging: each level folds the previous level's
# median in with the next slice of per-model predictions.
# (trailing numbers are observed leaderboard scores)
level0 = np.median(pred_subs[0:5], axis=0) # 0.4034
level1 = np.median([level0] + pred_subs[5:9], axis=0) # 0.40429
level2 = np.median([level1] + pred_subs[9:14], axis=0)
In [13]:
# External arboretum model's scores, aligned to this notebook's
# (order, product) rows. NOTE(review): unmatched pairs become NaN after the
# left join and are NOT filled here (unlike big_model_pred below) -- confirm
# the CSV covers every pair, otherwise NaN propagates through np.median.
aboretum_pred = pd.merge(gid[['order_id', 'product_id']],
                         pd.read_csv('./submission/prediction_arboretum.csv'),
                         on=['order_id', 'product_id'], how='left')
In [14]:
# Scores from a single large xgboost model, aligned to (order, product) rows;
# pairs missing from the CSV get score 0.
big_model_pred = pd.merge(gid[['order_id', 'product_id']],
                          pd.read_csv('./submission/xgb_score_0.8406880155364034.csv'),
                          on=['order_id', 'product_id'], how='left')
# Column assignment instead of the original chained `inplace=True` fillna:
# mutating the Series returned by attribute access is a chained-assignment
# pattern that silently stops working under pandas copy-on-write.
big_model_pred['score'] = big_model_pred['score'].fillna(0)
In [25]:
big_lgb_dart = pd.read_csv('./submission/big_lgb_dart_0.8386003614599506.csv')
In [34]:
# Final ensemble members: the bagged-median hierarchy, two held-out bagged
# models, and two external models. Combined by element-wise median below.
predictions = [level2, pred_subs[14], pred_subs[15],
               aboretum_pred.prediction.values,
               big_model_pred.score.values]
In [35]:
# Explicit .copy() so the score column is not assigned into a slice of
# `data` (chained-assignment pattern; the warning is silenced globally above).
user_product = data[['user_id', 'product_id', 'order_id']].copy()
# Element-wise median over all ensemble members.
user_product['score'] = np.median(predictions, axis=0)
In [ ]:
# Shing-style F1 threshold optimization -> submission CSV.
op = user_product.copy()
op = utils.shing_f1_optim(op, low_bound=0.01, topk=200)
# Right-join so every test order gets a row, even when the optimizer
# returned nothing for it. (The merge already yields columns
# ['order_id', 'products'], so the original rename was a no-op.)
op = pd.merge(op[['order_id', 'products']], orders[['order_id']],
              on=['order_id'], how='right')
# Vectorized fill replaces the original row-by-row .at loop: orders with
# no predicted products get the literal string 'None'.
op['products'] = op['products'].fillna('None')
op[['order_id', 'products']].to_csv('./submission/lgb3_big_bag_shing.csv', index=False)
In [37]:
# Tarbox-style F1 threshold optimization -> final submission CSV.
op = user_product.copy()
op = utils.tarbox_f1_optim(op, low_bound=0)
# Right-join so every test order gets a row, even when the optimizer
# returned nothing for it. (The merge already yields columns
# ['order_id', 'products'], so the original rename was a no-op.)
op = pd.merge(op[['order_id', 'products']], orders[['order_id']],
              on=['order_id'], how='right')
# Vectorized fill replaces the original row-by-row .at loop: orders with
# no predicted products get the literal string 'None'.
op['products'] = op['products'].fillna('None')
op[['order_id', 'products']].to_csv('./submission/final_bag1.csv', index=False)
In [ ]: